﻿/*学号=YJD201702032716 JH-C, G++ Raspberry-PI */
#include <vector>
#include <cstdio>
#include <memory>
#include <memory.h>
#include <omp.h>
#include <string.h>
#include <mutex>
#include <set>
#include <cmath>
#include <QCoreApplication>
#include <QDir>
FILE * fpTxt  = nullptr;
unsigned long long old_nn = 0;
inline void output_txt(unsigned long long count,unsigned long long n)
{
	char buf_txtfm[256] = {0};
	if ((old_nn/100000000 != n/100000000) || !fpTxt)
	{
		QDir dir(".");
		QString strDir = QString().asprintf("./prime_res/%016lld",n/100000000000);
		dir.mkpath(strDir);
		old_nn = n;
		sprintf (buf_txtfm,"./prime_res/%016lld/primer_%016lld.txt",n/100000000000,n/100000000);
		if (fpTxt)
			fclose(fpTxt);
		fpTxt = fopen64(buf_txtfm,"a+");
		if (!fpTxt)
			printf("Fopen %s failed!\n",buf_txtfm);
		else {
			printf("Fopen %s succeed!\n",buf_txtfm);
		}
		fseeko64(fpTxt,0,SEEK_END);
	}
	if (fpTxt)
		fprintf(fpTxt,"%lld\t%lld\n",count,n);
}

int main(int argc, char * argv[])
{
	QCoreApplication app(argc,argv);
	QDir dir(".");
	dir.mkpath("./prime_res");
	//开辟一个二进制的文件作为缓存，一个文本文件作为结果
	FILE * fp = fopen64("./prime_res/primer.dat","ab+");
	if (!fp)
		return -1;
	//要接着上次计算，读取最新的结果
	fseeko64(fp,0,SEEK_END);
	unsigned long long fsz = ftello64(fp);
	unsigned long long test = 2;
	//首次计算，把2写入，作为第一个素数
	if (fsz<sizeof(unsigned long long))
	{
		fseeko64(fp,0,SEEK_SET);
		fwrite(&test,sizeof(unsigned long long),1,fp);
		output_txt(1,2);
	}
	else
		//否则，读取上次的进度
	{
		fseeko64(fp,fsz - sizeof(unsigned long long),SEEK_SET);
		fread(&test,sizeof(unsigned long long),1,fp);
	}

	printf ("Calc primers from %lld\n",test);
	//设置下一个测试的值
	if (test==2)
		test = 3;
	else
		test += 2;//只测试奇数
	//OMP并行计算的初始规模。规模会随着测试数test的增大，而对数增大
	const unsigned int batch = 65536*16;
	//最大的内存读入历史素数个数，8MB。
	const int max_buf_size = 1024*1024*8;
	//获取本机的CPU计算核心个数
	const int numthreads = omp_get_num_procs();
	printf ("\nOMP Num proces = %d\n",numthreads);
	//为并行计算分配缓存，这里是C++17标准的，请注意用GCC的版本。
	std::shared_ptr<unsigned long long[][max_buf_size]> pbuffer( new unsigned long long [numthreads][max_buf_size]);
	//为并行计算分配建议缓存大小
	std::shared_ptr<int> suggest_buf_size_arr (new int [numthreads]);
	int * suggest_buf_size_ptr = suggest_buf_size_arr.get();
	memset (suggest_buf_size_ptr,0,sizeof(int)*numthreads);
	//为并行计算分配当前测试位置偏移
	std::shared_ptr<unsigned long long> pos_arr (new unsigned long long [numthreads]);
	unsigned long long * file_pos = pos_arr.get();
	for (int i=0;i<numthreads;++i)
		file_pos[i] = 1;
	//文件互斥
	std::mutex m_f;
	//初始的0~65536，采用单线程，后续的进行并行化。
	int sthr_fpos = 1;
	int rprimers_sthd = 0;
	while(1)
	{
		//单线程
		if (test*16<batch)
		{
			int & suggest_buf_size = suggest_buf_size_ptr[0];
			unsigned long long * buffer = pbuffer.get()[0];
			if (suggest_buf_size<2)
				suggest_buf_size = 2;
			if (suggest_buf_size>=max_buf_size)
				suggest_buf_size = max_buf_size;

			if (sthr_fpos!=0)
			{
				fseeko64(fp,0,SEEK_SET);
				rprimers_sthd = 	fread(buffer,sizeof(unsigned long long),suggest_buf_size,fp);
				sthr_fpos = 0;
			}
			bool ex_test = false;
			bool test_curr_ok = true;
			while (test_curr_ok && rprimers_sthd)
			{
				for (int i=0;i<rprimers_sthd && test_curr_ok;++i)
				{
					if (buffer[i] * buffer[i]>test)
					{
						ex_test = true;
						break;
					}
					if (test % buffer[i]==0)
						test_curr_ok = false;
				}
				if (ex_test)
					break;
				if (test_curr_ok)
				{
					fseeko64(fp,sthr_fpos,SEEK_SET);
					rprimers_sthd = fread(buffer,sizeof(unsigned long long),suggest_buf_size,fp);
					sthr_fpos = ftello64(fp);
					suggest_buf_size += 32;
				}
			}
			if (!test_curr_ok)
			{
				test+=2;
				continue;
			}

			printf ("Single Thread, Primer: %lld  \r", test);
			fseeko64(fp,0,SEEK_END);			
			fwrite(&test,sizeof(unsigned long long),1,fp);
			unsigned long long ttl = ftello64(fp)/8;
			output_txt(ttl,test);
			fflush(fp);
			test+=2;
		}
		//多线程
		else
		{
			//用于存储新增素数的集合，顺便可以按照大小顺序排序。
			std::set<unsigned long long> set_results;
			//计算并行规模
			unsigned long long rbat = (unsigned long long)(log(test)/log(2) * (batch*1.0/(log(batch)/log(2))))/2*2;
			//控制并行规模
			if (rbat>=1024*1024*16)
				rbat = 1024*1024*16;
			if (rbat<batch)
				rbat = batch;
			printf ("Batch %lld ", rbat);
			clock_t start_time = clock();
#pragma omp parallel for
			for (unsigned long long t = test; t<test+rbat;t+=2)
			{
				const int omp_n = omp_get_thread_num();
				int & suggest_buf_size = suggest_buf_size_ptr[omp_n];
				unsigned long long * buffer = pbuffer.get()[omp_n];
				if (suggest_buf_size<16)
					suggest_buf_size = 16;
				if (suggest_buf_size>=max_buf_size)
					suggest_buf_size = max_buf_size;
				int rprimers = 0;
				//需要重新读取缓存。由于绝大部分合数都在第一波缓存中被淘汰，所以这个读取行为发生的不频繁。
				if (file_pos[omp_n]!=0)
				{
					m_f.lock();
					fseeko64(fp,0,SEEK_SET);
					rprimers = 	fread(buffer,sizeof(unsigned long long),suggest_buf_size,fp);
					file_pos[omp_n] = 0;
					m_f.unlock();
				}
				else
					rprimers = suggest_buf_size;


				bool ex_test = false;
				bool test_curr_ok = true;
				while (test_curr_ok && rprimers)
				{
					for (int i=0;i<rprimers;++i)
					{
						if (!test_curr_ok)
							break;
						if (buffer[i] * buffer[i]>t)
						{
							ex_test = true;
							break;
						}
						if (t % buffer[i]==0)
							test_curr_ok = false;
					}
					if (ex_test)
						break;
					if (test_curr_ok)
					{
						m_f.lock();
						fseeko64(fp,file_pos[omp_n],SEEK_SET);
						rprimers = fread(buffer,sizeof(unsigned long long),suggest_buf_size,fp);
						file_pos[omp_n] = ftello64(fp);
						suggest_buf_size += 32;
						m_f.unlock();
					}
				}
				if (!test_curr_ok)
					continue;
#pragma omp critical (ccres)
				{
					set_results.insert(t);
				}

			}
			for (int i=0;i<numthreads;++i)
				file_pos[i] = 1;
			test += rbat;

			//输出
			if (set_results.size())
			{
				printf (",%d primes, last Primer: %lld ,", set_results.size(),*set_results.rbegin());
				fseeko64(fp,0,SEEK_END);
				unsigned long long ttl = ftello64(fp)/8;
				for(auto p = set_results.begin();p!=set_results.end();++p)
				{
					fwrite(&(*p),sizeof(unsigned long long),1,fp);
					output_txt(++ttl,*p);
				}
				fflush(fpTxt);
				fflush(fp);
				clock_t end_time = clock();
				clock_t spent = end_time - start_time;
				if (spent)
				{
					printf (" %lld / s", rbat * CLOCKS_PER_SEC/ (spent));
				}
				printf ("\n");
			}
			else {
				printf (", empty\n");
			}
		}


	}
	fclose(fpTxt);
	fclose(fp);
	return 0;
}
